Introduction
RNA-seq was run on 36 ovarian cancer cell lines, each in
singlicate.
All 36 cell lines have 72h cisplatin IC50s determined by Kristin
Adams and Kendra Wendt in the Lang Lab.
Add in links to Lab notebooks for IC50, RNAseq sample prep
DESeq2
To determine genes differentially expressed between cisplatin
sensitive and resistant cell lines, we used the median cisplatin IC50 of
all 36 cell lines as a cut-point, and excluded cell lines within +/- one
standard deviation of the median. These were defined in the metadata
table.
Load count matrix
countmatrix <- as.matrix(read.delim("../star_salmon/salmon.merged.gene_counts.tsv", sep="\t", row.names="gene_id"))
countmatrix <- countmatrix[,-1]
countmatrix2 <- matrix(as.numeric(countmatrix), ncol = ncol(countmatrix), dimnames = list(rownames(countmatrix), colnames(countmatrix)))
countmatrix2 <- round(countmatrix2)
countmatrix2 <- countmatrix2[,rownames(metadata)]
head(countmatrix2)
X16 X29 X14 X22 X21 X25 X20 X19 X28 X23 X27 X18 X31 X30 X17 X15 X2 X32 X13 X6 X11 X36
A1BG 28 21 52 45 48 39 8 66 20 36 0 52 2 4 27 54 0 16 37 26 14 35
A1BG-AS1 62 54 108 80 88 66 38 67 54 67 0 36 15 6 107 82 0 34 80 109 50 67
A1CF 1 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 7003 1 0 1 0 1
A2M 11 26 1161 0 10 8791 1 1 16 0 0 0 7 23 11639 0 8292 1 11 0 0 5
A2M-AS1 10 0 15 11 99 3 1 23 8 14 34 10 52 8 4 1 2 10 5 10 5 0
A2ML1 0 74 7 47 2 23 2 0 0 6 0 0 0 9 20 0 0 23 2 0 41 10
X10 X5 X8 X9 X24 X33 X34 X35 X26 X1 X12 X4 X3 X7
A1BG 14 30 2 32 4 1 9 14 4 137 30 58 0 0
A1BG-AS1 13 28 18 64 20 9 27 29 1 88 65 107 0 0
A1CF 0 0 0 0 0 0 0 0 0 0 0 0 0 0
A2M 0 38 0 0 0 113 3 89 0 0 11343 0 3 0
A2M-AS1 0 4 4 19 21 26 78 94 2 5 15 2 0 0
A2ML1 0 0 6 0 0 21 4 7 1 0 0 0 0 0
Create DESeqDataSet object dds
dds <- DESeqDataSetFromMatrix(countData = countmatrix2, colData = metadata, design = ~ Subtype + PlatinumSensitivity)
converting counts to integer mode
Warning: some variables in design formula are characters, converting to factors
Pre-filtering
This step removes genes with low expression to increase multiple
comparison power.
keep <- rowSums(counts(dds)) >= 500
dds <- dds[keep,]
nrow(dds)
[1] 15763
Run DESeq2
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
58 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
Print DESeq results
res <- results(dds, contrast=c("PlatinumSensitivity", "resistant", "sensitive"))
res
log2 fold change (MLE): PlatinumSensitivity resistant vs sensitive
Wald test p-value: PlatinumSensitivity resistant vs sensitive
DataFrame with 15763 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
A1BG 25.5204 -0.219516891 0.771050 -0.284698733 0.7758750 0.961014
A1BG-AS1 46.1152 0.000262359 0.722143 0.000363305 0.9997101 0.999864
A1CF 206.3747 0.097577283 1.898634 0.051393412 0.9590120 0.995023
A2M 1083.7271 -1.036788003 1.548419 -0.669578303 NA NA
A2M-AS1 16.4917 1.644444544 0.798528 2.059345594 0.0394611 0.483442
... ... ... ... ... ... ...
ZYG11A 420.874 0.0349471 0.579837 0.0602706 0.951940 0.994151
ZYG11B 2150.578 -0.0101554 0.248877 -0.0408048 0.967452 0.995362
ZYX 4329.321 -0.4054512 0.302526 -1.3402200 0.180174 0.711910
ZZEF1 1675.205 0.1541036 0.221902 0.6944659 0.487390 0.883409
ZZZ3 3280.554 0.2194200 0.210781 1.0409865 0.297882 0.803258
write.csv(as.data.frame(res), file = "DESeq.csv")
Filter DESeq2 results for significant genes
Filter res for padj < 0.05 and |log2FC| >= 1.2
res.filtered <- as.data.frame(res) %>%
filter(padj<0.05)%>%
filter(log2FoldChange >= 1.2 | log2FoldChange <= -1.2)
res.filtered
Data QC
PCA plot
pcaData <- plotPCA(vsd, intgroup=c("Subtype", "PlatinumSensitivity","CellLine"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
myColors <- c("#76AB7E", "#63E678", "#1D32FB", "#7B87FD", "#01BD1F", "#E8A426", "#0B7C1D", "#BB19E7")
names(myColors) <- levels(pcaData$Subtype)
colScale <- scale_colour_manual(name = "Subtype",values = myColors)
pca <- ggplot(pcaData, aes(PC1, PC2, color=Subtype, shape=PlatinumSensitivity, label=CellLine)) +
geom_point(size=3) +
geom_text(hjust=0, vjust=0) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() +
theme_classic() +
colScale
pca
ggsave("pca.pdf", pca)
Saving 7.29 x 4.51 in image

PCA plot groups samples roughly by subtype
Sample-wise correlation
vsd.mod <- vsd.df
colnames(vsd.mod) <- metadata$CellLine[match(colnames(vsd.mod), metadata$files)]
vsd.mod$gene <- row.names(vsd.mod)
corr <- vsd.mod %>%
select(-gene) %>%
cor(method = "spearman")
df.mod <- df
row.names(df.mod) <- metadata$CellLine[match(row.names(df.mod), metadata$files)]
pheatmap(corr, annotation_col=df.mod)

Barnes RF approach?
Contacted PI 10/27/2022 because script is not on github. No response
as of 12/16/2022.
Plot heatmaps to check sample to sample variability
Plotting the top 100 most highly expressed genes:
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:100]
df <- as.data.frame(colData(dds)[,c("Subtype", "PlatinumSensitivity")])
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
labels_col=colData(dds)[,c("CellLine")],annotation_col=df)

Dendrogram based on gene expression
Pull genes contributing to principal components 1 & 2
TPM <- as.matrix(read.delim("../star_salmon/salmon.merged.gene_tpm.tsv", sep="\t", row.names="gene_id"))
TPM <- TPM[,-1]
TPM <- matrix(as.numeric(TPM), ncol = ncol(TPM), dimnames = list(rownames(TPM), colnames(TPM)))
TPM <- round(TPM)
TPM <- TPM[,rownames(metadata)]
TPM2 <- TPM
colnames(TPM2) <- metadata$CellLine[match(colnames(TPM2), metadata$files)]
TPM.log <- log(TPM+1)
PCA <- prcomp(TPM.log, scale=TRUE)
PCA.mat <- as.data.frame(PCA$x)
PCA.PC1filt <- PCA.mat %>% filter(PC1 < quantile(PCA.mat[,"PC1"], .2)[[1]])
TPM3 <- TPM2[rowSums(TPM2)>1000,]
counts.sc <- t(TPM3)
dist <- dist(counts.sc)
clust <- hclust(dist, method="average")
dend <- as.dendrogram(clust)
par(mar=c(10,2,1,1))
my_colors <- ifelse(metadata$Subtype=="HGSC", "red",
ifelse(metadata$Subtype=="LGSC", "blue",
ifelse(metadata$Subtype=="OCCC", "yellow",
ifelse(metadata$Subtype=="EC", "green",
ifelse(metadata$Subtype=="SCCOHT","purple",
ifelse(metadata$Subtype=="MUC","orange","white"))))))
plot(dend)
colored_bars(colors = my_colors, dend = dend, rowLabels = "Subtype")

I tried a lot of different iterations of this (including: various
cutoffs for variance of genes, genes contributing most to first and
second principal components, more highly expressed genes, TPM vs vst
data, clustering methods), but the fundamental problem is that isogenic
pairs rarely cluster anywhere near each other, even though the PCA
analysis shows this relationship. I don’t know that this is a reliable
method for determining relatedness/subtyping, unless a robust gene set
is developed.
Data visualization
Plot Heatmap of top differentially expressed genes
vsd.df <-as.data.frame(assay(vsd))
pheatmap(vsd.df[rownames(res.filtered),], labels_col=colData(dds)[,c("CellLine")],annotation_col=df, color=colorRampPalette(c("white", "red"))(50))

Heatmap based on TPM
pheatmap(TPM.log[rownames(res.filtered),], labels_col=colData(dds)[,c("CellLine")], annotation_col=df, color=colorRampPalette(c("white", "red"))(10))

Heatmap with mean-centered data
center_scale <- function(x) {
scale(x, scale=FALSE)
}
vsd.meancenter <- apply(vsd.df, 1, center_scale)
vsd.meancenter <-t(vsd.meancenter)
colnames(vsd.meancenter) <- colnames(vsd.df)
vsd.meancenter <- as.data.frame(vsd.meancenter)
color <- colorRampPalette(brewer.pal(11, "PuOr"))(50)
myBreaks <- c(seq(min(vsd.meancenter[rownames(res.filtered),]), 0, length.out=ceiling(50/2) + 1),
seq(max(vsd.meancenter[rownames(res.filtered),])/50, max(vsd.meancenter[rownames(res.filtered),]), length.out=floor(50/2)))
pheatmap(vsd.meancenter[rownames(res.filtered),], labels_col=colData(dds)[,c("CellLine")], color=color, border_color = NA, annotation_col = df, breaks=myBreaks)

Volcano Plots
res.filtered %>%
ggplot(aes(x=log2FoldChange, y=-log10(pvalue), label=rownames(res.filtered))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red"))+
geom_text_repel()

Run Gene Ontology
Re-run DESeq using IC50 as continuous variable
Create DESeqDataSet object dds
dds2 <- DESeqDataSetFromMatrix(countData = countmatrix2, colData = metadata, design = ~ Subtype + IC50)
converting counts to integer mode
Warning: some variables in design formula are characters, converting to factors the design formula contains one or more numeric variables that have mean or
standard deviation larger than 5 (an arbitrary threshold to trigger this message).
Including numeric variables with large mean can induce collinearity with the intercept.
Users should center and scale numeric variables in the design to improve GLM convergence.
Pre-filtering
This step removes genes with low expression to increase multiple
comparison power.
keep <- rowSums(counts(dds2)) >= 100
dds2 <- dds2[keep,]
nrow(dds2)
[1] 18308
Run DESeq2
dds2 <- DESeq(dds2)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
875 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
Print DESeq results
res2 <- results(dds2)
res2
log2 fold change (MLE): IC50
Wald test p-value: IC50
DataFrame with 18308 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
A1BG 25.5152 -0.0250373 0.0747816 -0.334805 0.7377723 0.960709
A1BG-AS1 46.1094 -0.0105069 0.0701475 -0.149783 0.8809361 0.986400
A1CF 206.7023 0.0182140 0.1829988 0.099531 0.9207167 0.992385
A2M 1084.1287 -0.2270264 0.1656011 -1.370924 0.1703988 0.746062
A2M-AS1 16.4752 0.1649801 0.0765098 2.156327 0.0310582 0.488080
... ... ... ... ... ... ...
ZYG11A 420.769 0.00554376 0.0576535 0.0961566 0.9233962 0.992451
ZYG11B 2150.386 -0.01173472 0.0244272 -0.4803961 0.6309457 0.939287
ZYX 4329.425 -0.05132687 0.0304173 -1.6874224 0.0915222 0.650209
ZZEF1 1675.076 0.02185766 0.0240438 0.9090750 0.3633105 0.859931
ZZZ3 3280.365 0.02342910 0.0206913 1.1323150 0.2575020 0.810698
write.csv(as.data.frame(res2), file = "DESeq_IC50.csv")
Filter DESeq2 results for significant genes
Filter res for padj < 0.05 and |log2FC| >= 1.2
res.2.filtered <- as.data.frame(res2) %>%
filter(padj<0.05)%>%
filter(log2FoldChange >= 1.2 | log2FoldChange <= -1.2)
res.2.filtered
Data QC
PCA plot
pcaData2 <- plotPCA(vsd2, intgroup=c("Subtype", "PlatinumSensitivity","CellLine"), returnData=TRUE)
percentVar2 <- round(100 * attr(pcaData2, "percentVar"))
pca2 <- ggplot(pcaData2, aes(PC1, PC2, color=Subtype, shape=PlatinumSensitivity, label=CellLine)) +
geom_point(size=3) +
geom_text(hjust=0, vjust=0) +
xlab(paste0("PC1: ",percentVar2[1],"% variance")) +
ylab(paste0("PC2: ",percentVar2[2],"% variance")) +
coord_fixed() +
theme_classic() +
colScale
pca2
ggsave("pca_IC50.pdf", pca2)
Saving 7.29 x 4.51 in image

Data visualization
Plot Heatmap of top differentially expressed genes
vsd2.df <-as.data.frame(assay(vsd2))
color <- colorRampPalette(c("white", "red"))(40)
breaks <- seq(6,12,length.out=40)
pheatmap(vsd2.df[rownames(res.2.filtered),], labels_col=colData(dds2)[,c("CellLine")],annotation_col=df, color=color, breaks=breaks)

Heatmap with mean-centered data
vsd2.meancenter <- apply(vsd2.df, 1, center_scale)
vsd2.meancenter <-t(vsd2.meancenter)
colnames(vsd2.meancenter) <- colnames(vsd2.df)
vsd2.meancenter <- as.data.frame(vsd2.meancenter)
color <- colorRampPalette(brewer.pal(11, "PuOr"))(50)
myBreaks2 <- c(seq(min(vsd2.meancenter[rownames(res.2.filtered),]), 0, length.out=ceiling(50/2) + 1),
seq(max(vsd2.meancenter[rownames(res.2.filtered),])/50, max(vsd2.meancenter[rownames(res.2.filtered),]), length.out=floor(50/2)))
pheatmap(vsd2.meancenter[rownames(res.2.filtered),], labels_col=colData(dds2)[,c("CellLine")], color=color, border_color = NA, annotation_col = df, breaks=myBreaks2)

Volcano Plots
res.2.filtered %>%
ggplot(aes(x=log2FoldChange, y=-log10(pvalue), label=rownames(res.2.filtered))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red"))+
geom_text_repel()

Heatmap of genes involved in cisplatin resistance
Plot heatmap of genes annotated as involved in cisplatin resistance
in PMID: 34645978
resist.genes <- read.delim("ResistanceGenes.txt", header = FALSE)
resist.genes <- resist.genes$V1
pheatmap(TPM.log[resist.genes[resist.genes %in% row.names(TPM.log)],], labels_col=colData(dds)[,c("CellLine")],annotation_col=df, color=colorRampPalette(c("white", "red"))(50))

res2.resist <- res2[resist.genes[resist.genes %in% row.names(res2)],c("log2FoldChange","padj")]
res2.resist <- as.data.frame(res2.resist) %>%
filter(padj < 0.05)
res2.resist
Heatmap with mean-centered data
myBreaks3 <- c(seq(min(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],]), 0, length.out=ceiling(50/2) + 1),
seq(max(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],])/50, max(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],]), length.out=floor(50/2)))
pheatmap(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],], labels_col=colData(dds)[,c("CellLine")],annotation_col=df, color=color, border_color = NA, breaks=myBreaks3)

---
title: "Ovarian Cancer Cell Line RNAseq Platinum Sensitivity"
output: html_notebook
---

## Introduction  

RNA-seq was run on 36 ovarian cancer cell lines, each in singlicate.  

All 36 cell lines have 72h cisplatin IC50s determined by Kristin Adams and Kendra 
Wendt in the Lang Lab. 

![Cisplatin IC50s](CisplatinIC50.jpg){height=50%, width=50%}

Add in links to Lab notebooks for IC50, RNAseq sample prep

## Inputs

Inputs consisted of:  

* Metadata spreadsheet  
* salmon.merged.gene_counts.tsv file from nf-core/rnaseq pipeline output  
* Necessary packages  

```{r load packages, include=FALSE}
library(tidyverse)
library(readxl)
library(DESeq2)
library(vsn)
library(pheatmap)
library(RColorBrewer)
library(ggrepel)
library(dendextend)
```

## DESeq2

To determine genes differentially expressed between cisplatin sensitive and resistant cell lines, we used the median cisplatin IC50 of all 36 cell lines as a cut-point, and excluded cell lines within +/- one standard deviation of the median. These were defined in the metadata table.

### Read in metadata table

```{r load metatable}
as.data.frame(read_excel("Metadata3.xlsx")) -> metadata
row.names(metadata) <- metadata$files
metadata
```
### Load count matrix

```{r read count matrix}
countmatrix <- as.matrix(read.delim("../star_salmon/salmon.merged.gene_counts.tsv", sep="\t", row.names="gene_id"))
countmatrix <- countmatrix[,-1]
countmatrix2 <- matrix(as.numeric(countmatrix), ncol = ncol(countmatrix), dimnames = list(rownames(countmatrix), colnames(countmatrix)))
countmatrix2 <- round(countmatrix2)
countmatrix2 <- countmatrix2[,rownames(metadata)]
head(countmatrix2)
```
### Create DESeqDataSet object dds

```{r DESeqDataSet generation}
dds <- DESeqDataSetFromMatrix(countData = countmatrix2, colData = metadata, design = ~ Subtype + PlatinumSensitivity)
```
### Pre-filtering
This step removes genes with low expression to increase multiple comparison power.

```{r}
keep <- rowSums(counts(dds)) >= 500
dds <- dds[keep,]
nrow(dds)
```


### Run DESeq2 
```{r DESeq2}
dds <- DESeq(dds)
```

### Print DESeq results
```{r create results table}
res <- results(dds, contrast=c("PlatinumSensitivity", "resistant", "sensitive"))
res
write.csv(as.data.frame(res), file = "DESeq.csv")
```

### Filter DESeq2 results for significant genes
Filter res for padj < 0.05 and |log2FC| >= 1.2
```{r filter res}
res.filtered <- as.data.frame(res) %>%
  filter(padj<0.05)%>%
  filter(log2FoldChange >= 1.2 | log2FoldChange <= -1.2)
res.filtered
```
## Data QC

### Data transformation 
Here we performed normal transformation [log2(n+1)], variance stabilized transformation, and regularized log tranformation to improve visualization of the data values. To speed up subsequent re-runs, we have hidden analysis for non-vst.

```{r norm transformation and stdev plot}
# ntd <- normTransform(dds)
#meanSdPlot(assay(ntd))
```


```{r var stab transformation and stdev plot}
vsd <- vst(dds)
meanSdPlot(assay(vsd))
```


```{r reg log transformation and stdev plot}
# rld <- rlog(dds)
# meanSdPlot(assay(rld))
```

Based on this data, variance-stabilized tranformation lead to the lowest standard deviation between samples and was mostly located towards the high expression transcripts, as might be expected.

### PCA plot
```{r vsd PCA}
pcaData <- plotPCA(vsd, intgroup=c("Subtype", "PlatinumSensitivity","CellLine"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
myColors <- c("#76AB7E", "#63E678", "#1D32FB", "#7B87FD", "#01BD1F", "#E8A426", "#0B7C1D", "#BB19E7")
names(myColors) <- levels(pcaData$Subtype)
colScale <- scale_colour_manual(name = "Subtype",values = myColors)
pca <- ggplot(pcaData, aes(PC1, PC2, color=Subtype, shape=PlatinumSensitivity, label=CellLine)) +
  geom_point(size=3) +
  geom_text(hjust=0, vjust=0) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() +
  theme_classic() + 
  colScale
pca
ggsave("pca.pdf", pca)
```
PCA plot groups samples roughly by subtype

### Sample-wise correlation

```{r, fig.height=6, fig.width= 8}
vsd.mod <- vsd.df 
colnames(vsd.mod) <- metadata$CellLine[match(colnames(vsd.mod), metadata$files)]
vsd.mod$gene <- row.names(vsd.mod)

corr <- vsd.mod %>%
  select(-gene) %>%
  cor(method = "spearman")

df.mod <- df
row.names(df.mod) <- metadata$CellLine[match(row.names(df.mod), metadata$files)]

pheatmap(corr, annotation_col=df.mod)
```

### Barnes RF approach?
Contacted PI 10/27/2022 because script is not on github. No response as of 12/16/2022.


### Plot heatmaps to check sample to sample variability
Plotting the top 100 most highly expressed genes:
```{r heatmaps}
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:100]
df <- as.data.frame(colData(dds)[,c("Subtype", "PlatinumSensitivity")])
pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         labels_col=colData(dds)[,c("CellLine")],annotation_col=df)
```
### Dendrogram based on gene expression
Pull genes contributing to principal components 1 & 2
```{r}
TPM <- as.matrix(read.delim("../star_salmon/salmon.merged.gene_tpm.tsv", sep="\t", row.names="gene_id"))
TPM <- TPM[,-1]
TPM <- matrix(as.numeric(TPM), ncol = ncol(TPM), dimnames = list(rownames(TPM), colnames(TPM)))
TPM <- round(TPM)
TPM <- TPM[,rownames(metadata)]
TPM2 <- TPM
colnames(TPM2) <- metadata$CellLine[match(colnames(TPM2), metadata$files)]
TPM.log <- log(TPM+1)
PCA <- prcomp(TPM.log, scale=TRUE)
PCA.mat <- as.data.frame(PCA$x)
PCA.PC1filt <- PCA.mat %>% filter(PC1 < quantile(PCA.mat[,"PC1"], .2)[[1]])
```


```{r}
TPM3 <- TPM2[rowSums(TPM2)>1000,]
counts.sc <- t(TPM3)
dist <- dist(counts.sc)
clust <- hclust(dist, method="average")
dend <- as.dendrogram(clust)
```

```{r, fig.width=10}
par(mar=c(10,2,1,1))
my_colors <- ifelse(metadata$Subtype=="HGSC", "red", 
                    ifelse(metadata$Subtype=="LGSC", "blue", 
                           ifelse(metadata$Subtype=="OCCC", "yellow", 
                                  ifelse(metadata$Subtype=="EC", "green",
                                         ifelse(metadata$Subtype=="SCCOHT","purple", 
                                                ifelse(metadata$Subtype=="MUC","orange","white"))))))
plot(dend)
colored_bars(colors = my_colors, dend = dend, rowLabels = "Subtype")
```
I tried a lot of different iterations of this (including: various cutoffs for variance of genes, genes contributing most to first and second principal components, more highly expressed genes, TPM vs vst data, clustering methods), but the fundamental problem is that isogenic pairs rarely cluster anywhere near each other, even though the PCA analysis shows this relationship. I don't know that this is a reliable method for determining relatedness/subtyping, unless a robust gene set is developed. 

## Data visualization

### Plot Heatmap of top differentially expressed genes
```{r plot heatmap, fig.height = 8, fig.width = 8}
vsd.df <-as.data.frame(assay(vsd))
pheatmap(vsd.df[rownames(res.filtered),], labels_col=colData(dds)[,c("CellLine")],annotation_col=df, color=colorRampPalette(c("white", "red"))(50))
```
### Heatmap based on TPM
```{r, fig.height = 8, fig.width = 8}
pheatmap(TPM.log[rownames(res.filtered),], labels_col=colData(dds)[,c("CellLine")], annotation_col=df, color=colorRampPalette(c("white", "red"))(10))
```
### Heatmap with mean-centered data
```{r, fig.height=7, fig.width=8}
center_scale <- function(x) {
  scale(x, scale=FALSE)
}
vsd.meancenter <- apply(vsd.df, 1, center_scale)
vsd.meancenter <-t(vsd.meancenter)
colnames(vsd.meancenter) <- colnames(vsd.df)
vsd.meancenter <- as.data.frame(vsd.meancenter) 
color <- colorRampPalette(brewer.pal(11, "PuOr"))(50)
myBreaks <- c(seq(min(vsd.meancenter[rownames(res.filtered),]), 0, length.out=ceiling(50/2) + 1), 
              seq(max(vsd.meancenter[rownames(res.filtered),])/50, max(vsd.meancenter[rownames(res.filtered),]), length.out=floor(50/2)))
pheatmap(vsd.meancenter[rownames(res.filtered),], labels_col=colData(dds)[,c("CellLine")], color=color, border_color = NA, annotation_col = df, breaks=myBreaks)
```

### Volcano Plots
```{r, fig.height=5, fig.width=10}
res.filtered %>% 
  ggplot(aes(x=log2FoldChange, y=-log10(pvalue), label=rownames(res.filtered))) +
  geom_point() +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red"))+
  geom_text_repel()
```
  
### Run Gene Ontology
```{r run gene ontology}

```

## Re-run DESeq using IC50 as continuous variable
### Create DESeqDataSet object dds

```{r }
dds2 <- DESeqDataSetFromMatrix(countData = countmatrix2, colData = metadata, design = ~ Subtype + IC50)
```
### Pre-filtering
This step removes genes with low expression to increase multiple comparison power.

```{r}
keep <- rowSums(counts(dds2)) >= 100
dds2 <- dds2[keep,]
nrow(dds2)
```


### Run DESeq2 
```{r }
dds2 <- DESeq(dds2)
```

### Print DESeq results
```{r }
res2 <- results(dds2)
res2
write.csv(as.data.frame(res2), file = "DESeq_IC50.csv")
```

### Filter DESeq2 results for significant genes
Filter res for padj < 0.05 and |log2FC| >= 1.2
```{r }
res.2.filtered <- as.data.frame(res2) %>%
  filter(padj<0.05)%>%
  filter(log2FoldChange >= 1.2 | log2FoldChange <= -1.2)
res.2.filtered
```

## Data QC

### Data transformation 

```{r }
vsd2 <- vst(dds2)
meanSdPlot(assay(vsd2))
```

### PCA plot
```{r}
pcaData2 <- plotPCA(vsd2, intgroup=c("Subtype", "PlatinumSensitivity","CellLine"), returnData=TRUE)
percentVar2 <- round(100 * attr(pcaData2, "percentVar"))
pca2 <- ggplot(pcaData2, aes(PC1, PC2, color=Subtype, shape=PlatinumSensitivity, label=CellLine)) +
  geom_point(size=3) +
  geom_text(hjust=0, vjust=0) +
  xlab(paste0("PC1: ",percentVar2[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar2[2],"% variance")) +
  coord_fixed() +
  theme_classic() +
  colScale
pca2
ggsave("pca_IC50.pdf", pca2)
```

## Data visualization

### Plot Heatmap of top differentially expressed genes
```{r , fig.height = 3, fig.width = 8}
vsd2.df <-as.data.frame(assay(vsd2))
color <- colorRampPalette(c("white", "red"))(40)
breaks <- seq(6,12,length.out=40)  
pheatmap(vsd2.df[rownames(res.2.filtered),], labels_col=colData(dds2)[,c("CellLine")],annotation_col=df, color=color, breaks=breaks)
```

### Heatmap with mean-centered data
```{r, fig.height=3, fig.width=8}
vsd2.meancenter <- apply(vsd2.df, 1, center_scale)
vsd2.meancenter <-t(vsd2.meancenter)
colnames(vsd2.meancenter) <- colnames(vsd2.df)
vsd2.meancenter <- as.data.frame(vsd2.meancenter) 
color <- colorRampPalette(brewer.pal(11, "PuOr"))(50)
myBreaks2 <- c(seq(min(vsd2.meancenter[rownames(res.2.filtered),]), 0, length.out=ceiling(50/2) + 1), 
              seq(max(vsd2.meancenter[rownames(res.2.filtered),])/50, max(vsd2.meancenter[rownames(res.2.filtered),]), length.out=floor(50/2)))
pheatmap(vsd2.meancenter[rownames(res.2.filtered),], labels_col=colData(dds2)[,c("CellLine")], color=color, border_color = NA, annotation_col = df, breaks=myBreaks2)
```

### Volcano Plots
```{r, fig.height=5, fig.width=10}
res.2.filtered %>% 
  ggplot(aes(x=log2FoldChange, y=-log10(pvalue), label=rownames(res.2.filtered))) +
  geom_point() +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red"))+
  geom_text_repel()
```






### Heatmap of genes involved in cisplatin resistance
Plot heatmap of genes annotated as involved in cisplatin resistance in PMID: 34645978
```{r, fig.height = 45, fig.width = 8}
resist.genes <- read.delim("ResistanceGenes.txt", header = FALSE)
resist.genes <- resist.genes$V1
pheatmap(TPM.log[resist.genes[resist.genes %in% row.names(TPM.log)],], labels_col=colData(dds)[,c("CellLine")],annotation_col=df, color=colorRampPalette(c("white", "red"))(50))
```

```{r}
res2.resist <- res2[resist.genes[resist.genes %in% row.names(res2)],c("log2FoldChange","padj")]
res2.resist <- as.data.frame(res2.resist) %>%
  filter(padj < 0.05)
res2.resist
```
### Heatmap with mean-centered data
```{r, fig.height=45, fig.width=8}
myBreaks3 <- c(seq(min(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],]), 0, length.out=ceiling(50/2) + 1), 
              seq(max(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],])/50, max(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],]), length.out=floor(50/2)))
pheatmap(vsd.meancenter[resist.genes[resist.genes %in% row.names(vsd.meancenter)],], labels_col=colData(dds)[,c("CellLine")],annotation_col=df, color=color, border_color = NA, breaks=myBreaks3)
```



## Package versions
Figure out how to print all package versions used.
